{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Using MLRun with a remote Spark service"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Build a simple function that reads a CSV file using Spark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!/conda/bin/python\n",
    "\n",
    "import mlrun\n",
    "from mlrun.datastore import DataItem\n",
    "from mlrun.execution import MLClientCtx\n",
    "\n",
    "from pyspark.sql import SparkSession\n",
    "\n",
    "\n",
    "def describe_spark(context: MLClientCtx, dataset: DataItem, artifact_path):\n",
    "    \"\"\"Read a CSV dataset with Spark and log a sample of it as a dataset artifact.\n",
    "\n",
    "    The file is read with a header row and an inferred schema, its first 5 rows\n",
    "    are printed, and a sample taken without replacement (fraction 0.1) is logged\n",
    "    as the \"df_sample\" dataset in CSV format.\n",
    "\n",
    "    :param context:       MLRun execution context, used to log the sampled dataset\n",
    "    :param dataset:       input data item; its local path is read as a CSV file\n",
    "    :param artifact_path: accepted but not used here; the sample is logged under\n",
    "                          context.artifact_subpath(\"data\")\n",
    "    \"\"\"\n",
    "\n",
    "    # get file location\n",
    "    location = dataset.local()\n",
    "\n",
    "    # build spark session\n",
    "    spark = SparkSession.builder.appName(\"Spark job\").getOrCreate()\n",
    "\n",
    "    # stop the session even when reading or logging fails,\n",
    "    # so a failed run does not leave the Spark session running\n",
    "    try:\n",
    "        # read csv\n",
    "        df = spark.read.csv(location, header=True, inferSchema=True)\n",
    "\n",
    "        # show\n",
    "        df.show(5)\n",
    "\n",
    "        # sample for logging\n",
    "        df_to_log = df.sample(False, 0.1).toPandas()\n",
    "\n",
    "        # log final report\n",
    "        context.log_dataset(\n",
    "            \"df_sample\",\n",
    "            df=df_to_log,\n",
    "            format=\"csv\",\n",
    "            index=False,\n",
    "            artifact_path=context.artifact_subpath(\"data\"),\n",
    "        )\n",
    "    finally:\n",
    "        spark.stop()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# mlrun: end-code"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create a remote-spark MLRun function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create an MLRun function of kind \"remote-spark\" whose handler is describe_spark\n",
    "fn = mlrun.code_to_function(\n",
    "    handler=\"describe_spark\",\n",
    "    kind=\"remote-spark\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# select the Spark service for the function\n",
    "# NOTE(review): the name below looks like a placeholder - confirm and replace\n",
    "# it with the name of your own Spark service\n",
    "fn.with_spark_service(\n",
    "    spark_service=\"iguazio-spark-service-name\",\n",
    ")\n",
    "\n",
    "# deploy the function\n",
    "fn.deploy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# run the function with the CSV file as the \"dataset\" input,\n",
    "# using /User as the run's artifact path\n",
    "fn.run(\n",
    "    inputs={\"dataset\": \"iris_dataset.csv\"},\n",
    "    artifact_path=\"/User\",\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:root] *",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
